The Dataset was downloaded from Kaggle.com (https://www.kaggle.com/shahir/protein-data-set)
The dataset consists of two files: (1) 141401 rows and 14 columns; (2) 467304 rows and 5 columns.
knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(ggplot2)
library(readr)
library(tidyverse)
## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v purrr 0.3.2 v forcats 0.4.0
## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rio)
library(dplyr)
library(stringr)
library(tableone)
library(visdat)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:rio':
##
## export
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the two CSV files that make up the data set.
pdb_data_seq1 <- read_csv("Data/pdb_data_seq.csv")
pdb_data_no_dups <- read_csv("Data/pdb_data_no_dups.csv")

# Keep only the first row encountered for each structureId, so that every
# structure matches at most one sequence row in the join below.
pdb_data_seq2 <- pdb_data_seq1[!duplicated(pdb_data_seq1[["structureId"]]), ]

# Attach the sequence columns to the structure table; structures without a
# sequence row are kept (left join) and get NA in the added columns.
PDB <- pdb_data_no_dups %>%
  left_join(pdb_data_seq2, by = "structureId")

# Overview of column types and missingness for the joined table.
a <- vis_dat(PDB, warn_large_data = FALSE)
a
# The join produced ".x"/".y" copies of the columns both tables share.
# Drop the ".y" copies ...
PDB1 <- PDB %>%
  select(-macromoleculeType.y, -residueCount.y)

# ... and give the ".x" copies their plain names back.
PDB2 <- PDB1 %>%
  rename(
    macromoleculeType = macromoleculeType.x,
    residueCount = residueCount.x
  )

# Sort ascending by publication year.
PDB3 <- arrange(PDB2, publicationYear)

# Discard the first row of the sorted table (the smallest publicationYear),
# then turn the categorical columns into factors in one step.
# NOTE(review): the dropped row is presumably an invalid/outlier year - confirm.
PDB4 <- PDB3[-1, ] %>%
  mutate(
    experimentalTechnique = as.factor(experimentalTechnique),
    classification = as.factor(classification),
    publicationYear = as.factor(publicationYear),
    macromoleculeType = as.factor(macromoleculeType)
  )
Missing values in the publication year column were removed.
# Bar chart: number of structures per publication year, with each bar split
# by experimental technique.

# Rows without a publication year cannot be placed on the x axis.
PDB5 <- dplyr::filter(PDB4, !is.na(publicationYear))

p3 <- ggplot(PDB5, aes(x = publicationYear)) +
  # Use the reserved word TRUE: `T` is an ordinary variable that can be
  # reassigned, which would silently change this argument.
  geom_bar(aes(fill = experimentalTechnique), show.legend = TRUE) +
  theme_classic() +
  theme(
    # Rotated, small tick labels so the year labels do not overlap.
    axis.text.x = element_text(angle = 90, size = 5),
    # Legend drawn inside the plotting area, kept compact.
    legend.position = c(0.4, 0.8),
    legend.text = element_text(size = 5),
    legend.key.size = unit(1, "mm")
  ) +
  xlab("Year of Publication")
p3
Missing values in experimentalTechnique and resolution were removed.
# Jitter plot of resolution for each experimental technique, coloured by
# macromolecule type.

# Remove rows lacking a technique first, then rows lacking a resolution.
PDB5 <- PDB4 %>%
  dplyr::filter(!is.na(experimentalTechnique))
PDB6 <- PDB5 %>%
  dplyr::filter(!is.na(resolution))

p1 <- ggplot(PDB6, aes(x = experimentalTechnique, y = resolution)) +
  # Small, semi-transparent points to reduce overplotting.
  geom_jitter(aes(colour = macromoleculeType), size = 1, alpha = 0.5) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, size = 7),
    legend.position = c(0.9, 0.7),
    legend.text = element_text(size = 5),
    legend.key.size = unit(1, "mm")
  ) +
  labs(x = "Experimental Technique")
p1
Resolution vs. molecular weight for structures published in the year 2017. Missing values for molecular weight, resolution and macromolecule type were removed.
# Interactive scatter plot of resolution against molecular weight for the
# structures published in 2017.

# All conditions must hold for a row to be kept: resolution at most 10,
# publication year 2017, and no missing molecular weight, resolution or
# macromolecule type.
PDB7 <- PDB4 %>%
  dplyr::filter(
    resolution <= 10,
    publicationYear == 2017,
    !is.na(structureMolecularWeight),
    !is.na(resolution),
    !is.na(macromoleculeType)
  )

# `label = classification` is mapped as an aesthetic although no layer draws
# text; it is carried on the plot object handed to ggplotly() below.
p1 <- ggplot(
  PDB7,
  aes(x = structureMolecularWeight, y = resolution, label = classification)
) +
  geom_point(aes(colour = macromoleculeType), alpha = 0.5) +
  theme(
    legend.position = c(0.4, 0.8),
    legend.text = element_text(size = 5),
    legend.key.size = unit(1, "mm")
  ) +
  labs(x = "Molecular Weight")

# Convert to a plotly widget, add a range slider under the x axis and set
# the hover mode to "x".
p1 <- p1 %>%
  ggplotly(dynamicTicks = TRUE) %>%
  rangeslider(start = 0) %>%
  layout(hovermode = "x")
p1